CIND820 Analytics Project

Controlling a Pandemic: Analyzing Key Factors in COVID-19 Outcomes by Country

Dataset: Our world in COVID-19

In [151]:
%matplotlib inline
import pandas as pd
import numpy as np

#Visualization
#!pip install plotly_express
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

#Modeling and evaluation
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report


sns.set_style("darkgrid")
In [152]:
OWIDdf = pd.read_csv('https://github.com/owid/covid-19-data/raw/master/public/data/owid-covid-data.csv')
In [153]:
OWIDdfc = OWIDdf.copy()

Data Cleaning

In [154]:
OWIDdfc.head()
Out[154]:
iso_code continent location date total_cases new_cases new_cases_smoothed total_deaths new_deaths new_deaths_smoothed ... gdp_per_capita extreme_poverty cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index
0 AFG Asia Afghanistan 2020-02-24 1.0 1.0 NaN NaN NaN NaN ... 1803.987 NaN 597.029 9.59 NaN NaN 37.746 0.5 64.83 0.511
1 AFG Asia Afghanistan 2020-02-25 1.0 0.0 NaN NaN NaN NaN ... 1803.987 NaN 597.029 9.59 NaN NaN 37.746 0.5 64.83 0.511
2 AFG Asia Afghanistan 2020-02-26 1.0 0.0 NaN NaN NaN NaN ... 1803.987 NaN 597.029 9.59 NaN NaN 37.746 0.5 64.83 0.511
3 AFG Asia Afghanistan 2020-02-27 1.0 0.0 NaN NaN NaN NaN ... 1803.987 NaN 597.029 9.59 NaN NaN 37.746 0.5 64.83 0.511
4 AFG Asia Afghanistan 2020-02-28 1.0 0.0 NaN NaN NaN NaN ... 1803.987 NaN 597.029 9.59 NaN NaN 37.746 0.5 64.83 0.511

5 rows × 59 columns

In [155]:
OWIDdfc.describe()
#many nonsensical negative values like -min in new_deaths columns.
Out[155]:
total_cases new_cases new_cases_smoothed total_deaths new_deaths new_deaths_smoothed total_cases_per_million new_cases_per_million new_cases_smoothed_per_million total_deaths_per_million ... gdp_per_capita extreme_poverty cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index
count 7.809700e+04 78095.000000 77094.000000 6.864800e+04 68806.000000 77094.000000 77673.000000 77671.000000 76675.000000 68237.000000 ... 72328.000000 49338.000000 72926.000000 73893.000000 57251.000000 56405.000000 36632.000000 66670.000000 75633.000000 72873.000000
mean 7.321902e+05 5384.242205 5383.007991 2.105278e+04 133.844389 118.376661 8819.525145 70.758845 70.731244 202.027804 ... 19132.030611 13.308087 257.501683 7.807451 10.528570 32.652618 50.958063 3.031795 73.154248 0.727473
std 5.100517e+06 33204.194859 32796.389027 1.239046e+05 729.773318 668.640572 17154.290760 173.052172 146.642448 356.636527 ... 19778.031889 19.927580 118.658958 3.956053 10.403022 13.473166 31.765492 2.465274 7.551019 0.150068
min 1.000000e+00 -74347.000000 -6223.000000 1.000000e+00 -1918.000000 -232.143000 0.001000 -2153.437000 -276.825000 0.001000 ... 661.240000 0.100000 79.370000 0.990000 0.100000 7.700000 1.188000 0.100000 53.280000 0.394000
25% 8.260000e+02 2.000000 6.429000 4.200000e+01 0.000000 0.000000 175.055000 0.157000 1.074000 6.173000 ... 4466.507000 0.500000 167.295000 5.290000 1.900000 21.600000 20.859000 1.300000 67.880000 0.602000
50% 9.026000e+03 64.000000 78.429000 2.810000e+02 2.000000 1.143000 1213.970000 6.928000 9.222000 35.193000 ... 12951.839000 2.000000 242.648000 7.110000 6.300000 31.400000 49.839000 2.400000 74.530000 0.748000
75% 9.794100e+04 713.000000 744.143000 2.741000e+03 17.000000 12.857000 8436.081000 62.231500 69.830000 215.996000 ... 27216.445000 21.200000 329.635000 10.080000 19.300000 41.100000 83.241000 3.861000 78.730000 0.848000
max 1.318136e+08 881102.000000 739597.143000 2.860554e+06 17904.000000 14432.857000 159011.195000 8652.658000 2648.773000 2526.571000 ... 116935.600000 77.600000 724.417000 30.530000 44.000000 78.100000 98.999000 13.800000 86.750000 0.957000

8 rows × 54 columns

Negative Data

  • some columns have negative data: newcases columns
  • the codebook does not mention any reasons for negative data so it is assumed to be typoes and filtered out
In [156]:
df = OWIDdfc.copy()
df.new_cases.min()
Out[156]:
-74347.0

Absolute values of columns with negative values

In [160]:
#new_cols = [col for col in df.columns if 'new' in col]
#new_cols
In [161]:
#df[new_cols].describe()
In [162]:
#df[new_cols] = abs(df[new_cols])
In [163]:
#df[new_cols].describe()
In [164]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79667 entries, 0 to 79666
Data columns (total 59 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   iso_code                               79667 non-null  object 
 1   continent                              75781 non-null  object 
 2   location                               79667 non-null  object 
 3   date                                   79667 non-null  object 
 4   total_cases                            78097 non-null  float64
 5   new_cases                              78095 non-null  float64
 6   new_cases_smoothed                     77094 non-null  float64
 7   total_deaths                           68648 non-null  float64
 8   new_deaths                             68806 non-null  float64
 9   new_deaths_smoothed                    77094 non-null  float64
 10  total_cases_per_million                77673 non-null  float64
 11  new_cases_per_million                  77671 non-null  float64
 12  new_cases_smoothed_per_million         76675 non-null  float64
 13  total_deaths_per_million               68237 non-null  float64
 14  new_deaths_per_million                 68395 non-null  float64
 15  new_deaths_smoothed_per_million        76675 non-null  float64
 16  reproduction_rate                      64392 non-null  float64
 17  icu_patients                           8303 non-null   float64
 18  icu_patients_per_million               8303 non-null   float64
 19  hosp_patients                          9944 non-null   float64
 20  hosp_patients_per_million              9944 non-null   float64
 21  weekly_icu_admissions                  726 non-null    float64
 22  weekly_icu_admissions_per_million      726 non-null    float64
 23  weekly_hosp_admissions                 1335 non-null   float64
 24  weekly_hosp_admissions_per_million     1335 non-null   float64
 25  new_tests                              36571 non-null  float64
 26  total_tests                            36314 non-null  float64
 27  total_tests_per_thousand               36314 non-null  float64
 28  new_tests_per_thousand                 36571 non-null  float64
 29  new_tests_smoothed                     41821 non-null  float64
 30  new_tests_smoothed_per_thousand        41821 non-null  float64
 31  positive_rate                          40511 non-null  float64
 32  tests_per_case                         39897 non-null  float64
 33  tests_units                            43204 non-null  object 
 34  total_vaccinations                     6454 non-null   float64
 35  people_vaccinated                      5830 non-null   float64
 36  people_fully_vaccinated                4107 non-null   float64
 37  new_vaccinations                       5475 non-null   float64
 38  new_vaccinations_smoothed              10143 non-null  float64
 39  total_vaccinations_per_hundred         6454 non-null   float64
 40  people_vaccinated_per_hundred          5830 non-null   float64
 41  people_fully_vaccinated_per_hundred    4107 non-null   float64
 42  new_vaccinations_smoothed_per_million  10143 non-null  float64
 43  stringency_index                       68065 non-null  float64
 44  population                             79172 non-null  float64
 45  population_density                     74262 non-null  float64
 46  median_age                             72047 non-null  float64
 47  aged_65_older                          71221 non-null  float64
 48  aged_70_older                          71642 non-null  float64
 49  gdp_per_capita                         72328 non-null  float64
 50  extreme_poverty                        49338 non-null  float64
 51  cardiovasc_death_rate                  72926 non-null  float64
 52  diabetes_prevalence                    73893 non-null  float64
 53  female_smokers                         57251 non-null  float64
 54  male_smokers                           56405 non-null  float64
 55  handwashing_facilities                 36632 non-null  float64
 56  hospital_beds_per_thousand             66670 non-null  float64
 57  life_expectancy                        75633 non-null  float64
 58  human_development_index                72873 non-null  float64
dtypes: float64(54), object(5)
memory usage: 35.9+ MB

Dropping columns

  • columns will be removed based on null values or relevance
  • some columns have less than 10000 non-null values, but may be kept due to high relevance. ie. new vaccinations,
In [165]:
#some of this information is too sparse to analyse, and is somewhat redundant when we are already analysing death rates
df.drop(columns=df.columns[17:25].tolist(), inplace=True)
In [166]:
df_dropcol = df.copy()

Date indexing

In [167]:
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date')
In [168]:
#Preprocessing
In [169]:
df_world = df[df.location=='World']
df = df[-df.location.isin(['World'])]
In [170]:
odf = df.copy()
odf1 = df.copy()

Exploratory Data Analysis

In [171]:
#creating crude mortality features
odf['crude_mortality'] = (odf.total_deaths/odf.population)
In [172]:
odf.tail()[['location','crude_mortality']]
Out[172]:
location crude_mortality
date
2021-04-01 Zimbabwe 0.000102
2021-04-02 Zimbabwe 0.000103
2021-04-03 Zimbabwe 0.000103
2021-04-04 Zimbabwe 0.000103
2021-04-05 Zimbabwe 0.000103
In [174]:
fig, ax = plt.subplots(figsize=(10,7))
for x, y in enumerate([df_world.new_cases_smoothed]):
    y.plot(ax=ax, linestyle='-')
for x, y in enumerate([df_world.new_deaths_smoothed]):
    y.plot(ax=ax, linestyle='-')
    
ax.legend(fontsize='x-large')
ax.set(xlabel = '', ylabel='Cases', title = 'Worldwide cases in 2020')
Out[174]:
[Text(0, 0.5, 'Cases'),
 Text(0.5, 0, ''),
 Text(0.5, 1.0, 'Worldwide cases in 2020')]
In [175]:
odf_canada = odf[odf.location=='Canada']
In [351]:
fig, ax = plt.subplots(3, 1, figsize=(15,10), sharex=True)
for x, y in enumerate([odf_canada.total_cases]):
    y.plot(ax=ax[0], linestyle='-')
for x, y in enumerate([odf_canada.total_deaths]):
    y.plot(ax=ax[0], linestyle='-', color='r')
for x, y in enumerate([odf_canada.total_deaths]):
    y.plot(ax=ax[1], linestyle='-', color='r')
for x, y in enumerate([odf_canada.stringency_index]):
    y.plot(ax=ax[2], linestyle='-')  

ax[0].set(xlabel = '', ylabel='Population', title = 'Canada: new cases and deaths')
ax[1].set(xlabel = '', ylabel='Population', title = 'Canada: new deaths scaled')
ax[2].set(xlabel = '', ylabel='Stringency Index', title = 'Canada: Stringency Index')

ax[0].legend(loc='upper left', fontsize='x-large')
ax[0].annotate('February 7, 2021',
               xy=('2021-02-11', odf_canada.total_cases['2021-02-25']-20000),
               xycoords = 'data', xytext=('2021-02-01',580000),
               arrowprops=dict(facecolor='black'),
               horizontalalignment='left',
               verticalalignment='top')
ax[0].annotate('First reported case of P.1 Variant',
               xy=('2021-02-11', odf_canada.total_cases['2021-02-25']),
               xycoords = 'data', xytext=('2021-01-15',450000))
Out[351]:
Text(2021-01-15, 450000, 'First reported case of P.1 Variant')
In [470]:
def vtplot(case='Canada'):
    fig, ax = plt.subplots(figsize=(14,7))
    for x, y in enumerate([df[df.location==case].total_cases.dropna()]):
        y.plot(ax=ax, linestyle='-')
    #for x, y in enumerate([df[df.location==case].total_deaths.dropna()]):
    #    y.plot(ax=ax, linestyle='-', color = 'r')
    for x, y in enumerate([df[df.location==case].people_vaccinated.dropna()]):
        y.plot(ax=ax, linestyle='-')
    for x, y in enumerate([df[df.location==case].total_vaccinations.dropna()]):
        y.plot(ax=ax, linestyle='-')

    ax.set_title('Vaccines and cases in '+case, fontsize='xx-large')
    ax.set_ylabel('Population')
    ax.legend(loc = 'upper left', fontsize='x-large')
In [471]:
vtplot('North America')
vtplot('South America')
vtplot('Asia')
vtplot('Africa')
vtplot('Europe')
vtplot('Oceania')
In [451]:
vtplot('India')
In [452]:
vtplot('United States')
In [453]:
vtplot('Brazil')
In [454]:
vtplot('Europe')
In [455]:
vtplot('United Kingdom')
In [457]:
vtplot('Japan')
In [458]:
vtplot('South Korea')
In [460]:
vtplot('Philippines')
In [466]:
vtplot('North America')
In [467]:
vtplot('Asia')
In [179]:
fig, ax = plt.subplots(figsize=(15,10))
continents = ['South America','North America','Asia','Africa','Europe','Oceania']#No Antarctica information available

for i in continents:
    for x, y in enumerate([odf[odf.location==i].total_cases]):
        y.plot(ax=ax, linestyle='-')
ax.set(xlabel = '', ylabel='Population', title = 'Worldwide new cases by continent')  
ax.legend(continents, fontsize='x-large')
Out[179]:
<matplotlib.legend.Legend at 0x207147473c8>
In [475]:
fig, ax = plt.subplots(figsize=(15,9))
continents = ['South America','North America','Asia','Africa','Europe','Oceania']#No Antarctica information available

for i in continents:
    for x, y in enumerate([odf[odf.location==i].total_deaths]):
        y.plot(ax=ax, linestyle='-')
ax.set(xlabel = '', ylabel='Deaths', title = 'Worldwide deaths by continent') 
ax.legend(continents, loc='upper left',fontsize= 'x-large')
Out[475]:
<matplotlib.legend.Legend at 0x207459a2e08>
In [182]:
odf_nocontinent = odf[-odf.location.isin(['World','European Union','North America','South America','Asia','Africa','Europe','Oceania'])] # Removing Continents
In [183]:
odf_totalcases = odf_nocontinent.groupby(odf_nocontinent['location']).agg(['max'])
In [184]:
odf_totalcases.columns = odf_totalcases.columns.droplevel(1)
In [185]:
fig = px.choropleth(odf_totalcases, locations='iso_code',
                    color='total_cases',
                    hover_name=odf_totalcases.index, 
                    color_continuous_scale=px.colors.sequential.deep)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()
In [186]:
fig = px.choropleth(odf_totalcases, locations='iso_code',
                    color='total_deaths',
                    hover_name=odf_totalcases.index, 
                    hover_data=['population_density','stringency_index','human_development_index'],
                    color_continuous_scale=px.colors.sequential.deep)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()

Median Age

In [187]:
fig = px.choropleth(odf_totalcases, locations='iso_code',
                    color=odf_totalcases.median_age,
                    hover_name=odf_totalcases.index, 
                    color_continuous_scale=px.colors.diverging.curl,
                    color_continuous_midpoint=20)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()
In [188]:
odf_ma40 = odf[odf.median_age>40]
odf_ma20 = odf[odf.median_age<20]

top5_ma = odf_totalcases.sort_values(by='median_age', ascending=False).head(5).index.tolist()
bot5_ma = odf_totalcases.sort_values(by='median_age', ascending=True).head(5).index.tolist()
In [706]:
fig, ax = plt.subplots(2,1,figsize=(12,8), sharex=True)

for i in top5_ma:
    for x, y in enumerate([odf[odf.location==i].total_cases_per_million]):
        y.plot(ax=ax[0], linestyle='-')
for i in bot5_ma:
    for x, y in enumerate([odf[odf.location==i].total_cases_per_million]):
        y.plot(ax=ax[0], linestyle='dotted')
        
for i in top5_ma:
    for x, y in enumerate([odf[odf.location==i].total_deaths_per_million]):
        y.plot(ax=ax[1], linestyle='-')
for i in bot5_ma:
    for x, y in enumerate([odf[odf.location==i].total_deaths_per_million]):
        y.plot(ax=ax[1], linestyle='dotted')

ax[0].set(xlabel = '', ylabel='Population', title= 'Total Cases per million')
#ax[0].set_title('Total Cases per million',fontsize=20)
ax[0].legend(top5_ma+bot5_ma, fontsize='large')
ax[1].set(xlabel = '', ylabel='Population', title='Total Deaths per million')  
#ax[1].set_title('Total Deaths per million',fontsize=20)
plt.tight_layout()
In [380]:
zf = pd.DataFrame(df.groupby(by=['iso_code', 'location']).agg(['mean']))
In [381]:
zf.reset_index(inplace=True)
In [383]:
zf.columns = zf.columns.droplevel(1)
In [389]:
zf.stringency_index.isnull().sum()
Out[389]:
37
In [393]:
fig = px.choropleth(zf, locations='iso_code',
                    color='stringency_index',
                    hover_name='location', 
                    color_continuous_scale=px.colors.sequential.deep)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()
In [191]:
list_pd = odf_totalcases.sort_values('population_density', ascending=False).head(5).index.tolist()
In [192]:
odf2 = odf_totalcases.copy()
odf2.drop(labels=list_pd, axis=0, inplace=True)
In [699]:
zf.sort_values(by='stringency_index', ascending=False).head(10)[['location','stringency_index','total_deaths_per_million','total_cases_per_million']]
Out[699]:
location stringency_index total_deaths_per_million total_cases_per_million
81 Honduras 87.50 210.67 7635.86
55 Eritrea 86.92 1.78 239.33
206 Venezuela 84.77 21.56 2280.34
108 Libya 82.90 121.99 7663.96
16 Bangladesh 78.38 27.81 1864.83
163 Palestine 78.25 138.92 12855.29
127 Myanmar 77.91 21.99 980.31
11 Azerbaijan 77.03 117.75 8696.86
156 Panama 76.69 575.58 31737.51
39 Colombia 76.54 513.11 17822.71
In [700]:
zf.sort_values(by='stringency_index', ascending=True).head(10)[['location','stringency_index','total_deaths_per_million','total_cases_per_million']].dropna()
Out[700]:
location stringency_index total_deaths_per_million total_cases_per_million
139 Nicaragua 13.64 17.94 612.30
12 Burundi 14.93 0.13 63.44
21 Belarus 19.20 99.89 12064.05
198 Tanzania 23.66 0.33 7.70
197 Taiwan 24.53 0.28 22.32
137 Niger 30.71 3.59 82.63
210 Yemen 33.82 16.96 58.37
In [193]:
fig = px.choropleth(odf2, locations='iso_code',
                    color='population_density',
                    hover_name=odf2.index, 
                    color_continuous_scale=px.colors.sequential.deep)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()

Hospital beds per thousand

In [612]:
hbdf = odft.sort_values(by='hospital_beds_per_thousand', ascending=False)[['hospital_beds_per_thousand','total_deaths_per_million']].dropna()
In [655]:
fig, host = plt.subplots(figsize=(12,8))
par1 = host.twinx()

host.set_xlim(-5, 165)
host.set_xticks([])
host.set_ylim(-1, 14)
par1.set_ylim(-100, 2500)

host.set_xlabel('countries')
host.set_ylabel('Hospital beds per thousand')
par1.set_ylabel('total deaths per million')
p1 = host.plot(np.array(hbdf.hospital_beds_per_thousand.values), color='g')
p2 = par1.scatter(x=np.arange(0,160),y=np.array(hbdf.total_deaths_per_million.values))
host.legend(labels=['hospital beds per thousand'], loc='upper left', fontsize='x-large')
par1.legend(labels=['total_deaths'], loc='upper right', fontsize='x-large')
Out[655]:
<matplotlib.legend.Legend at 0x20754853648>

Counterintuitively, the trend shows that less hospital capacity is correlated somewhat to a lower total death count. This means mean there are other factors to be considered.

In [194]:
fig = px.choropleth(odf_totalcases, locations='iso_code',
                    color='hospital_beds_per_thousand',
                    hover_name=odf_totalcases.index, 
                    color_continuous_scale=px.colors.sequential.deep)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()
In [423]:
odf_totalcases[odf_totalcases.total_cases_per_million.between(2000,10000)][['total_deaths_per_million','total_cases_per_million','population','population_density','hospital_beds_per_thousand']].sort_values(by='total_deaths_per_million').dropna()
Out[423]:
total_deaths_per_million total_cases_per_million population population_density hospital_beds_per_thousand
location
Mongolia 3.96 3553.99 3278292.00 1.98 7.00
Uzbekistan 18.85 2503.85 33469199.00 76.13 4.00
Ghana 24.20 2925.37 31072945.00 126.72 0.90
Mozambique 25.02 2182.88 31255435.00 37.73 0.70
Sri Lanka 27.13 4370.89 21413250.00 341.95 3.60
Cameroon 32.06 2159.92 26545864.00 50.88 1.30
South Korea 34.17 2072.01 51269183.00 527.97 12.27
Cuba 38.49 7116.86 11326616.00 110.41 5.20
Kenya 41.73 2593.35 53771300.00 87.32 1.40
Gabon 53.47 8924.27 2225728.00 7.86 6.30
Bangladesh 56.58 3913.06 164689383.00 1265.04 0.80
Myanmar 58.92 2619.22 54409794.00 81.72 0.90
Venezuela 59.01 5892.12 28435943.00 36.25 0.80
Zambia 66.47 4841.67 18383956.00 23.00 2.00
Pakistan 67.56 3151.69 220892331.00 255.57 0.60
Gambia 68.69 2277.93 2416664.00 207.57 1.10
Algeria 70.88 2684.98 43851043.00 17.35 1.90
Japan 72.97 3848.87 126476458.00 347.78 13.05
Equatorial Guinea 74.13 5031.41 1402985.00 45.19 2.10
Djibouti 76.92 8801.60 988002.00 41.28 1.40
Zimbabwe 102.60 2484.97 14862927.00 42.73 1.70
Trinidad and Tobago 103.61 5853.56 1399491.00 266.89 3.00
Nepal 104.20 9548.40 29136808.00 204.43 0.30
Egypt 119.31 2010.39 102334403.00 98.00 1.60
India 119.96 9192.76 1380004385.00 450.42 0.53
Philippines 122.60 7331.54 109581085.00 351.87 1.00
Indonesia 152.88 5622.79 273523621.00 145.72 1.04
Comoros 167.89 4283.60 869595.00 437.35 2.20
In [419]:
pd.set_option('display.float_format', '{:.2f}'.format)
hbpt = odf_totalcases[odf_totalcases.population.between(100000000,200000000)][['total_deaths_per_million','total_cases_per_million','population','population_density','hospital_beds_per_thousand']].sort_values(by='total_deaths_per_million')
In [656]:
hbpt.sort_values(by='hospital_beds_per_thousand', ascending=False)
Out[656]:
total_deaths_per_million total_cases_per_million population population_density hospital_beds_per_thousand
location
Japan 72.97 3848.87 126476458.00 347.78 13.05
Russia 678.72 31096.84 145934460.00 8.82 8.05
Egypt 119.31 2010.39 102334403.00 98.00 1.60
Mexico 1585.32 17464.18 128932753.00 66.44 1.38
Philippines 122.60 7331.54 109581085.00 351.87 1.00
Bangladesh 56.58 3913.06 164689383.00 1265.04 0.80
Ethiopia 26.09 1890.40 114963583.00 104.96 0.30
In [195]:
odf_hb7 = odf[odf.hospital_beds_per_thousand>6]
odf_hb6 = odf[odf.hospital_beds_per_thousand<6]

top5_hbpt = odf_totalcases.sort_values(by='hospital_beds_per_thousand', ascending=False).head(5).index.tolist()
bot5_hbpt = odf_totalcases.sort_values(by='hospital_beds_per_thousand', ascending=True).head(5).index.tolist()
In [359]:
bot5_hbpt
Out[359]:
['Mali', 'Madagascar', 'Guinea', 'Ethiopia', 'Nepal']
In [710]:
fig, ax = plt.subplots(2,1, figsize=(12,8), sharex=True)

for i in top5_hbpt:
    for x, y in enumerate([odf[odf.location==i].total_deaths_per_million]):
        y.plot(ax=ax[0], linestyle='-')
for i in bot5_hbpt:
    for x, y in enumerate([odf[odf.location==i].total_deaths_per_million]):
        y.plot(ax=ax[0], linestyle='dotted')

for i in top5_hbpt:
    for x, y in enumerate([odf[odf.location==i].total_deaths_per_million]):
        y.plot(ax=ax[1], linestyle='-')
for i in bot5_hbpt:
    for x, y in enumerate([odf[odf.location==i].total_deaths_per_million]):
        y.plot(ax=ax[1], linestyle='dotted')

ax[0].set(xlabel = '', ylabel='Population', title = 'Total cases per million')  
ax[0].legend(top5_hbpt+bot5_hbpt, fontsize='large')
ax[1].set(xlabel = '', ylabel='Population', title = 'Total deaths per million')  
Out[710]:
[Text(0, 0.5, 'Population'),
 Text(0.5, 0, ''),
 Text(0.5, 1.0, 'Total deaths per million')]
In [197]:
fig = px.choropleth(odf_totalcases, locations='iso_code',
                    color='extreme_poverty',
                    hover_name=odf_totalcases.index, 
                    color_continuous_scale=px.colors.sequential.deep)
fig.update_geos(fitbounds='locations', visible=False)
fig.show()

Correlations

In [198]:
#Removing redundant columns
#some columns are removed due to multicollinearity
OWIDdf = OWIDdf.drop(columns=['aged_65_older','aged_70_older','iso_code','continent'])
In [663]:
odft=odf_totalcases.copy()
list_ts = [col for col in odft.columns if 'new' in col or 'total' in col or 'people' in col]
odft = odft.drop(columns=['aged_65_older','aged_70_older','iso_code','continent','reproduction_rate','positive_rate','tests_per_case']+list_ts)
In [664]:
odft.corr()
Out[664]:
stringency_index population population_density median_age gdp_per_capita extreme_poverty cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index crude_mortality
stringency_index 1.00 0.06 -0.21 0.01 -0.14 -0.26 0.04 0.10 -0.06 0.08 0.42 -0.08 0.00 0.09 0.12
population 0.06 1.00 -0.03 0.04 -0.05 -0.04 -0.00 0.03 -0.12 0.03 0.04 -0.04 -0.02 -0.02 -0.07
population_density -0.21 -0.03 1.00 0.15 0.41 -0.03 -0.18 0.02 -0.06 0.00 0.10 0.32 0.24 0.18 -0.00
median_age 0.01 0.04 0.15 1.00 0.64 -0.70 -0.34 0.16 0.67 0.20 0.79 0.66 0.85 0.90 0.60
gdp_per_capita -0.14 -0.05 0.41 0.64 1.00 -0.51 -0.48 0.18 0.35 -0.09 0.65 0.30 0.68 0.76 0.31
extreme_poverty -0.26 -0.04 -0.03 -0.70 -0.51 1.00 0.19 -0.40 -0.41 -0.19 -0.75 -0.44 -0.75 -0.78 -0.46
cardiovasc_death_rate 0.04 -0.00 -0.18 -0.34 -0.48 0.19 1.00 0.13 -0.17 0.42 -0.04 0.02 -0.47 -0.43 -0.21
diabetes_prevalence 0.10 0.03 0.02 0.16 0.18 -0.40 0.13 1.00 -0.05 0.17 0.47 -0.07 0.26 0.21 -0.02
female_smokers -0.06 -0.12 -0.06 0.67 0.35 -0.41 -0.17 -0.05 1.00 0.20 0.30 0.50 0.52 0.58 0.68
male_smokers 0.08 0.03 0.00 0.20 -0.09 -0.19 0.42 0.17 0.20 1.00 0.33 0.37 0.07 0.11 0.10
handwashing_facilities 0.42 0.04 0.10 0.79 0.65 -0.75 -0.04 0.47 0.30 0.33 1.00 0.45 0.82 0.84 0.44
hospital_beds_per_thousand -0.08 -0.04 0.32 0.66 0.30 -0.44 0.02 -0.07 0.50 0.37 0.45 1.00 0.47 0.56 0.34
life_expectancy 0.00 -0.02 0.24 0.85 0.68 -0.75 -0.47 0.26 0.52 0.07 0.82 0.47 1.00 0.91 0.52
human_development_index 0.09 -0.02 0.18 0.90 0.76 -0.78 -0.43 0.21 0.58 0.11 0.84 0.56 0.91 1.00 0.56
crude_mortality 0.12 -0.07 -0.00 0.60 0.31 -0.46 -0.21 -0.02 0.68 0.10 0.44 0.34 0.52 0.56 1.00
In [671]:
save = sns.heatmap(odft.corr()).set_title('OWID dataset Correlation Heatmap')

save.figure.savefig('heatmap.png')
In [712]:
sns.heatmap(odf_totalcases.corr()[['total_cases_per_million']].sort_values(by='total_cases_per_million', ascending=False),
            annot=True, cmap='BrBG').set_title('Features correlating with total_cases')
Out[712]:
Text(0.5, 1, 'Features correlating with total_cases')
In [713]:
dhmp = sns.heatmap(odf_totalcases.corr()[['total_deaths_per_million']].sort_values(by='total_deaths_per_million', ascending=False), annot=True, cmap='BrBG')
dhmp.set_title('Features correlating with total_deaths')
Out[713]:
Text(0.5, 1, 'Features correlating with total_deaths')
In [ ]:
nowiddfn = odf.select_dtypes(include='float64')
#ppdf = sns.pairplot(nowiddfn.sample(1000))
#ppdf.savefig("pairplotowid.png")

Some of the target variables to consider: reproduction_rate, total_cases_per_million, total_deaths_per_million. Normally we would also consider crude mortality rate or case-fatality rate as well. However, confirmed cases is often underreported AND undertested and as such may be harder to evaluate accurately. It is important to consider that the data being robust and accurate relies heavily on the nation's testing capacity.

Remember that the reproduction rate (R) describes the trajectory of the virus. A value of R = 1 means the amount of new infections and new recoveries are equal; meaning the virus numbers will stagnate. A value of 6.74 means the number of infected is sharply increasing and may lead in a big spike of infections and deaths, depending on government mitigation strategy.

In [ ]:
print(OWIDdf.reproduction_rate.max())
odf[odf.reproduction_rate==OWIDdf.reproduction_rate.max()]

Calculating Crude Mortality/Case-Fatality

  • Crude Mortality and Case-Fatality are other metrics to see how each country mitigates COVID-19.
In [ ]:
odft.info()

South Korea reported one of the largest spikes in the early stages of the pandemic, yet they are one of the countries with the lowest COVID-19 Mortality, which may be attributed to hospital bed capacity.

Italy may have suffered more loss due to their higher median age.

Crude Mortality or Case fatality may be used a generated target feature vs total_deaths_per_million. However, because case-fatality veracity and consistency is heavily dependent on testing capacity of the region, this feature will be discarded, and crude_mortality will be the focus.

In [ ]:
odfc = odf_totalcases.copy()
odfc['crude_mortality'] = (odfc.total_deaths/odfc.population)
In [ ]:
odfc.crude_mortality.describe()
In [ ]:
#if using crude_mortality, belgium actually performs the worst
odfc[odfc.crude_mortality == odfc.crude_mortality.max()][['population','crude_mortality']]
In [ ]:
odfc.sort_values(by='crude_mortality', ascending = False).head(10)[['crude_mortality']]
In [ ]:
odfc.sort_values(by='crude_mortality', ascending = True).head(10)[['crude_mortality']]
In [ ]:
odfc[odfc.index=='Tanzania'][['total_cases','total_deaths','population','crude_mortality']]

Many African Countries did very well in response to the virus.

Visualizations

In [ ]:
odfdt = odfdt[odfdt['total_cases_per_million'].notna()]
odfdt = odfdt[odfdt['population_density'].notna()]
In [ ]:
odf1 = df.groupby(df['location']).agg(['max'])
odf1.columns = odf1.columns.droplevel(1)
odf1.reset_index(level=0, inplace=True)
In [ ]:
odf1.sort_values(by='total_vaccinations', ascending=False).head(10)[['location']].values.tolist()
In [ ]:
def timeplot(loc, case1, case2):
    fig, ax = plt.subplots(figsize=(14,7))
    for x, y in enumerate([df[df.location==loc][case1]]):
        y.plot(ax=ax, linestyle='-')
    for x, y in enumerate([df[df.location==loc][case2]]):
        y.plot(ax=ax, linestyle='-', color = 'r')
    #for x, y in enumerate([odf_canada.total_tests]):
    #    y.plot(ax=ax, linestyle='-')
    #for x, y in enumerate([odf_canada.total_vaccinations]):
    #    y.plot(ax=ax, linestyle='-')

    ax.set_title(''+case1+' vs '+case2, fontsize='xx-large')
    ax.legend(loc = 'upper left', fontsize='x-large')
In [ ]:
timeplot(loc='United States', case1='new_vaccinations_smoothed', case2='total_cases')
In [ ]:
timeplot(loc='India', case1='new_vaccinations_smoothed', case2='total_cases')
In [ ]:
timeplot(loc='Canada', case1='total_cases_per_million', case2='handwashing_facilities')
In [ ]:
 
In [674]:
odf1 = odf1[odf1.population.notna()]
In [675]:
odf1 = odf1[odf1.total_cases.notna()]
In [676]:
odf1 = odf1[odf1.population_density.notna()]
In [677]:
odf1 = odf1[odf1.human_development_index.notna()]
In [678]:
odf1 = odf1[odf1.gdp_per_capita.notna()]
In [ ]:
odf1 = odf1[-odf1.location.isin(['World','European Union','North America','South America','Asia','Africa','Europe','Oceania'])]
In [ ]:
odfpx = odf1.sort_values(by='population_density', ascending=False).head(10)
In [711]:
px.scatter(odf1, x='population',y='population_density', size='total_deaths_per_million',
           color ='continent', hover_name='location',
           title='pop vs pop. density vs total deaths per million')
In [679]:
px.scatter(odf1, x='life_expectancy',y='human_development_index', size='gdp_per_capita',
           color ='continent', hover_name='location',
           title='HDI vs Life_expectancy vs gdp_per_capita')
In [ ]:
odf1.info()
In [689]:
odf1 = odf1[odf1.total_deaths_per_million.notna()]
In [ ]:
odf1[['female_smokers','male_smokers']]
In [ ]:
odf1['smokers'] = odf1.female_smokers + odf1.male_smokers
In [691]:
px.scatter(odf1, x='cardiovasc_death_rate',y='diabetes_prevalence', size='total_deaths_per_million',
           color ='continent', hover_name='location',
           title='cardiovasc vs diabetes vs total_deaths_per_million')
In [692]:
px.scatter(odf1, x='male_smokers',y='female_smokers', size='total_deaths_per_million',
           color ='continent', hover_name='location',
           title='msmoke vs fsmoke vs total_deaths_per_million')
In [681]:
odf1 = odf1[odf1.median_age.notna()]
In [ ]:
odf1.sort_values(by='population_density', ascending=False)[['location','population_density']]
In [ ]:
odf_totalcases.sort_values(by='aged_65_older', ascending=False).head(10)
In [682]:
px.scatter(odf1, x='population',y='population_density', size='median_age',
           color ='continent', hover_name='location',
           title='cardiovasc, diabetes, smokers')

if the population of the country is under 1 million, the value of total_cases_per_million is scaled up proportionally and may be inaccurate

Population density does not look like a large contributing factor to crude mortality rate, when looking at the higher total_deaths_per_million values.

In [ ]:
odfdt_vis = odfdt[odfdt['handwashing_facilities'].notna()]
In [ ]:
px.scatter(odfdt_vis,x='crude_mortality',y='handwashing_facilities',
           color='location', hover_data=['location'],
           title='Handwashing facilities and crude mortality')

The data shows that having more handwashing facilities doesn't necessarily indicate better COVID19 outcomes

In [ ]:
px.scatter(odfdt,x='crude_mortality',y='median_age',
           color='location', hover_data=['location'],
           title='median age and crude mortality')
  • Countries with higher median age are typically hit harder

Bucketizing target variable crude_mortality for classification

  • crude_mortality may be the best target variable as it is not susceptible to over-extrapolation like the total_deaths_per_million or total_cases_per_million variables when total population is much lower than a million.
  • crude mortality is also not reliant on testing capacity as the metric does not take into account confirmed cases, only confirmed deaths which are readily available. Confirmed covid cases will vary in each region based on testing capacity
In [ ]:
odft[odft.crude_mortality > 5.644496e-05].count() #50% of the target values to bucketize the output into 2 groups for classifying
In [ ]:
#bucketized column
odft['outcomes'] = pd.cut(odft['crude_mortality'],[0,5.644496e-05,1],labels = [0,1])
In [ ]:
odft.outcomes.value_counts()

ARIMA preprocessing

In [203]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA, ARIMAResults
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa import stattools
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox
from sklearn.metrics import mean_squared_error
from pandas.core.nanops import nanmean as pd_nanmean
from sklearn.linear_model import LinearRegression
In [204]:
odf.describe()
Out[204]:
total_cases new_cases new_cases_smoothed total_deaths new_deaths new_deaths_smoothed total_cases_per_million new_cases_per_million new_cases_smoothed_per_million total_deaths_per_million ... extreme_poverty cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index crude_mortality
count 7.765700e+04 77655.000000 76660.000000 68208.000000 68366.000000 76660.000000 77233.000000 77231.000000 76241.000000 67797.000000 ... 48898.000000 72486.000000 73453.000000 56811.000000 55965.000000 36192.000000 66230.000000 75193.000000 72433.000000 6.779700e+04
mean 5.043338e+05 3727.931737 3726.836176 14579.275671 92.864289 82.067348 8839.842915 70.944380 70.916309 202.485904 ... 13.337854 257.649986 7.803242 10.560282 32.637032 50.846556 3.033966 73.157608 0.727415 2.024859e-04
std 2.642110e+06 18465.743319 18136.506238 63567.402100 409.244730 370.698916 17196.173140 173.513533 147.023724 357.633691 ... 20.014556 119.003241 3.967510 10.436964 13.524875 31.941807 2.473305 7.572952 0.150521 3.576337e-04
min 1.000000e+00 -74347.000000 -6223.000000 1.000000 -1918.000000 -232.143000 0.001000 -2153.437000 -276.825000 0.001000 ... 0.100000 79.370000 0.990000 0.100000 7.700000 1.188000 0.100000 53.280000 0.394000 7.246354e-10
25% 8.070000e+02 2.000000 6.286000 41.000000 0.000000 0.000000 173.880000 0.149000 1.065000 6.166000 ... 0.500000 167.295000 5.290000 1.900000 21.400000 19.351000 1.300000 67.440000 0.602000 6.166152e-06
50% 8.880000e+03 62.000000 76.429000 274.000000 2.000000 1.143000 1205.329000 6.819000 9.046000 34.827000 ... 2.000000 243.811000 7.110000 6.200000 31.400000 49.839000 2.397000 74.620000 0.750000 3.482700e-05
75% 9.482100e+04 689.000000 718.143000 2596.000000 16.000000 12.286000 8416.598000 62.258000 69.904000 216.070000 ... 21.200000 329.635000 10.080000 19.300000 41.100000 83.241000 4.000000 78.730000 0.848000 2.160696e-04
max 4.077619e+07 346447.000000 287095.143000 930862.000000 7554.000000 5608.286000 159011.195000 8652.658000 2648.773000 2526.571000 ... 77.600000 724.417000 30.530000 44.000000 78.100000 98.999000 13.800000 86.750000 0.957000 2.526571e-03

8 rows × 47 columns

In [205]:
#rolling average for each variable
def cra(country,case='total_cases'):
    ts=odf.loc[(odf['location']==country)]  
    ts=ts[[case]]
    a=len(ts.loc[(ts[case]>=10)])
    ts=ts[-a:]
    ts.astype('int64')
    return (ts.rolling(window=7,center=False).mean().dropna())


def crplot(country, case='total_cases'):
    ts=odf.loc[(odf['location']==country)]  
    ts=ts[[case]]
    a=len(ts.loc[(ts[case]>=10)])
    ts=ts[-a:]
    ts.astype('int64')
    
    plt.figure(figsize=(16,6))
    plt.plot(ts[case])
    plt.plot(ts.rolling(window=7,center=False).mean().dropna(),label='Rolling Mean')
    plt.legend(['Cases', 'Rolling Mean'])
    plt.title('Cases distribution in %s with rolling mean and standard' %country)
In [206]:
crplot('South America', case='total_cases_per_million')
In [207]:
c1 = cra('Canada')
In [208]:
crplot('Canada')
In [209]:
crplot('United States')
In [210]:
odfcra = pd.DataFrame(cra('Canada'))

Stationarity testing

In [606]:
fig = sm.tsa.seasonal_decompose(cra('Canada').values, period = 30).plot()
In [334]:
def stationarity(ts):
    print('Augmented Dickey-Fuller test: Canada\'s total_cases')
    test = adfuller(ts, autolag='AIC')
    results = pd.Series(test[0:4], index=['Test Statistic','p-value','Lags Used','Number of Observations'])
    print(results)

stationarity(c1.total_cases.values)
Augmented Dickey-Fuller test: Canada's total_cases
Test Statistic              1.172919
p-value                     0.995793
Lags Used                  16.000000
Number of Observations    384.000000
dtype: float64

The p-value tells us the likelihood of stationarity. Because this p-value is over the alpha limit, data may be non-stationary. ARIMA is a good model in this case

In [714]:
stationarity(cra('India').total_cases.values)
crplot('India')
Augmented Dickey-Fuller test: Canada's total_cases
Test Statistic             2.23
p-value                    1.00
Lags Used                 16.00
Number of Observations   375.00
dtype: float64
In [715]:
stationarity(cra('China').total_cases.values)
crplot('China')
Augmented Dickey-Fuller test: Canada's total_cases
Test Statistic            -0.52
p-value                    0.89
Lags Used                 18.00
Number of Observations   415.00
dtype: float64

ACF and PACF

In [228]:
def autocorr(ts):
    plt.figure(figsize=(12,5))
    layout = (1, 2)
    
    ax_acf= plt.subplot2grid(layout, (0, 0))
    ax_pacf = plt.subplot2grid(layout, (0, 1))
    
    plot_acf(ts, lags=15, title='ACF', ax=ax_acf)
    plot_pacf(ts, lags=15, title='PACF', ax=ax_pacf)
    plt.tight_layout()
    
autocorr(cra('Canada'))

ARIMA Modeling

In [231]:
import itertools
import warnings
warnings.simplefilter('ignore')
In [527]:
import itertools
import warnings
warnings.simplefilter('ignore')

def split(ts):
    size = int(len(ts) * 0.85)
    train = ts[:size]
    test = ts[size:]
    return(train,test)

#pDq hyperparameter optimization
def arima(ts,test):
    p = d = q = range(0,6)
    x = 99999
    pdq = list(itertools.product(p,d,q))
    
    for combo in pdq:
        try:
            model = ARIMA(ts, order=combo)
            result = model.fit()
            if (result.aic <= x):
                x = result.aic
                param = combo
        except:
            continue
    return param
Out[527]:
'\n    #Modeling\n    model = ARIMA(ts, order=param)\n    result = model.fit()\n    \n    result.plot_predict(start=10, end=int(len(ts) * 1.45))\n    \n    #Printing the error metrics\n    print(result.summary())        \n    return (pred)\n\ntrain,test=split(c1.total_cases.values)\npred=arima(train,test)'
In [ ]:
c1 = cra('Canada',case='total_cases')
c2 = cra('Canada',case='total_deaths')
In [ ]:
train,test=split(c2.total_deaths.values)
param=arima(train,test)

#Modeling
model_totaldeaths = ARIMA(train, order=param)
result_totaldeaths = model.fit()
In [532]:
train,test=split(c1.total_cases.values)
param=arima(train,test)

#Modeling
model_totalcases = ARIMA(train, order=param)
result_totalcases = model.fit()
In [529]:
def rmse(y1, y_pred):
    y1, y_pred = np.array(y1), np.array(y_pred)
    rmse = sqrt(mean_squared_error(y1, y_pred))
    print('Test RMSE: %.3f' % rmse)

rmse(test, pred)
Test RMSE: 19904.024
In [534]:
plt.figure(figsize=(15,10))
layout=(1,1)

forecast = plt.subplot2grid(layout, (0,0))
plt.setp(forecast, xticks=[0,100,200,300,400,460], xticklabels=['March 1st, 2020','','','','April 1st, 2021'])
result.plot_predict(start=10, end=460, ax=forecast)
forecast.legend(fontsize='xx-large', loc='upper left')
forecast.tick_params(axis='x', labelsize=15)
forecast.tick_params(axis='y', labelsize=13)
forecast.legend(['forecast','actual','95% CI'], loc = 'upper left', fontsize='xx-large')
forecast.set_ylabel('COVID19 Deaths', fontsize=15)
forecast.set_title('ARIMA Model Predictions for COVID19 Deaths in Canada', fontsize=20 )
Out[534]:
Text(0.5, 1.0, 'ARIMA Model Predictions for COVID19 Deaths in Canada')
In [540]:
pred=result.forecast(steps=len(test))[0]
fig, ax = plt.subplots(1, 2, figsize=(16,4))

ax[0].plot(pred, c='g', label = 'Predicted')
ax[0].plot(test, c='r', label = 'Actual')
ax[1].plot(pred-test, c='b', label = 'Residuals', marker='*')
ax[0].legend(loc='upper left', fontsize='x-large')
ax[1].legend(loc='upper left', fontsize='x-large')
plt.setp(ax[0], xticks=[0, 10, 20, 30, 40, 50,56], xticklabels=['February 1st, 2021','','','March 1st, 2021','','','April 1st, 2021'])
ax[1].tick_params(axis='x', labelsize=13)
fig.suptitle('ARIMA Model error', fontsize=20 )
Out[540]:
Text(0.5, 0.98, 'ARIMA Model error')
In [501]:
print(result_totalcases.summary())
                             ARIMA Model Results                              
==============================================================================
Dep. Variable:                   D2.y   No. Observations:                  338
Model:                 ARIMA(4, 2, 5)   Log Likelihood               -2130.974
Method:                       css-mle   S.D. of innovations            130.928
Date:                Mon, 05 Apr 2021   AIC                           4283.947
Time:                        23:40:53   BIC                           4326.001
Sample:                             2   HQIC                          4300.707
                                                                              
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.3560     15.530      0.667      0.505     -20.082      40.794
ar.L1.D2.y     0.5466      0.119      4.591      0.000       0.313       0.780
ar.L2.D2.y    -0.6372      0.081     -7.867      0.000      -0.796      -0.478
ar.L3.D2.y     0.5814      0.096      6.044      0.000       0.393       0.770
ar.L4.D2.y    -0.0808      0.090     -0.896      0.370      -0.258       0.096
ma.L1.D2.y    -0.3352      0.106     -3.174      0.002      -0.542      -0.128
ma.L2.D2.y     0.4566      0.070      6.521      0.000       0.319       0.594
ma.L3.D2.y    -0.3361      0.094     -3.578      0.000      -0.520      -0.152
ma.L4.D2.y    -0.0046      0.077     -0.061      0.952      -0.155       0.145
ma.L5.D2.y     0.5162      0.048     10.767      0.000       0.422       0.610
                                    Roots                                    
=============================================================================
                  Real          Imaginary           Modulus         Frequency
-----------------------------------------------------------------------------
AR.1           -0.1796           -1.1418j            1.1558           -0.2748
AR.2           -0.1796           +1.1418j            1.1558            0.2748
AR.3            1.5404           -0.0000j            1.5404           -0.0000
AR.4            6.0120           -0.0000j            6.0120           -0.0000
MA.1            0.9498           -0.6463j            1.1488           -0.0951
MA.2            0.9498           +0.6463j            1.1488            0.0951
MA.3           -0.2114           -0.9774j            1.0000           -0.2839
MA.4           -0.2114           +0.9774j            1.0000            0.2839
MA.5           -1.4678           -0.0000j            1.4678           -0.5000
-----------------------------------------------------------------------------
In [ ]:
print(result_totaldeaths.summary())

Data Mining experimenting

Random Forest Regressor

In [ ]:
#Most recent data
odfdt = odft.copy()
odfdt['date'] = pd.to_datetime(odfdt['date'])
cutoff_date = pd.Timestamp('2021-02-01 00:00:00');

#Shortened dataset to only the most recent date to remove all time-related values
odfdt = odfdt[odfdt.date == cutoff_date]
In [545]:
odft = odf_totalcases.copy()
In [570]:
list_nocol = [col for col in odft.columns if 'new' in col or 'per' in col or 'people' in col or 'aged' in col]
In [572]:
list_nocol = ['new_cases',
 'new_cases_smoothed',
 'new_deaths',
 'new_deaths_smoothed',
 'total_cases_per_million',
 'new_cases_per_million',
 'new_cases_smoothed_per_million',
 'total_deaths_per_million',
 'new_deaths_per_million',
 'new_deaths_smoothed_per_million',
 'new_tests',
 'total_tests_per_thousand',
 'new_tests_per_thousand',
 'new_tests_smoothed',
 'new_tests_smoothed_per_thousand',
 'tests_per_case',
 'people_vaccinated',
 'people_fully_vaccinated',
 'new_vaccinations',
 'new_vaccinations_smoothed',
 'total_vaccinations_per_hundred',
 'people_vaccinated_per_hundred',
 'people_fully_vaccinated_per_hundred',
 'new_vaccinations_smoothed_per_million',
 'aged_65_older',
 'aged_70_older']
In [587]:
y=odft.total_deaths
X=odft[odft.columns.difference(list_nocol+['iso_code','location','continent','crude_mortality','total_deaths','positive_rate','population','total_cases'])] 
In [588]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
In [589]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp = imp.fit(train_X)
train_X = imp.transform(train_X)

valimp = SimpleImputer(missing_values=np.nan, strategy='median')
valimp = valimp.fit(val_X)
val_X = valimp.transform(val_X)


val_y = val_y.fillna(val_y.median())
train_y = train_y.fillna(train_y.median())
In [590]:
clf = RandomForestRegressor(n_estimators=100, bootstrap = True, oob_score = True, random_state = 1)
clf = clf.fit(train_X, train_y)
In [591]:
print('R^2 training set: {:.2f} \nR^2 val set: {:.2f}'.format(clf.score(train_X, train_y),clf.score(val_X, val_y)))
R^2 training set: 0.92 
R^2 val set: -8.95
In [592]:
val_pred = clf.predict(val_X)
val_mae = mean_absolute_error(val_pred, val_y)
print("Validation MAE: {:,.0f}".format(val_mae))
Validation MAE: 9,345

Extremely high R^2 score and very low MAE suggests severe overfitting of the model

In [593]:
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")

for f in range (train_X.shape[1]):
    featurelist = []
    featurelist.append(X.columns[indices[f]])
    print(f + 1,"\t", X.columns[indices[f]], importances[indices[f]])
    
plt.figure()
plt.title("Feature importances")
plt.bar(range(train_X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(train_X.shape[1]), indices)
plt.xlim([-1, train_X.shape[1]])
plt.show()
Feature ranking:
1 	 total_vaccinations 0.660178764523194
2 	 total_tests 0.13050976261460176
3 	 male_smokers 0.05460501739509316
4 	 gdp_per_capita 0.02405079610192234
5 	 diabetes_prevalence 0.021522416526784896
6 	 population_density 0.02008124809192595
7 	 reproduction_rate 0.017109290445791167
8 	 stringency_index 0.012007471790289884
9 	 hospital_beds_per_thousand 0.01101213508077598
10 	 median_age 0.010652424752273347
11 	 human_development_index 0.01036438899981282
12 	 life_expectancy 0.006634395312292519
13 	 extreme_poverty 0.006249585675342656
14 	 female_smokers 0.005142398595283959
15 	 handwashing_facilities 0.004981284253069588
16 	 cardiovasc_death_rate 0.0048986198415459465

The model itself is too biased for these feature importances to hold significance.

Multiple Regression

In [ ]:
y=odft.outcomes
X=odft[odft.columns.difference(['total_deaths_per_million','total_deaths','total_cases_per_million','date',
                              'tests_units','continent','reproduction_rate','crude_mortality','location','outcomes'])] 
In [ ]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
In [ ]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp = imp.fit(train_X)
train_X = imp.transform(train_X)

valimp = SimpleImputer(missing_values=np.nan, strategy='median')
valimp = valimp.fit(val_X)
val_X = valimp.transform(val_X)
##train_y2 = train_y.fillna(train_y.median())
In [ ]:
model = sm.OLS(train_y, train_X).fit()
In [ ]:
predictions = model.predict(train_X)
In [ ]:
model.summary()
In [ ]:
print(X.columns[8], X.columns[7], X.columns[2])

life expectancy, human_development_index and extreme_poverty look like the strongest candidates for feature importance from the linear regression, however these results should be interpreted with caution.

Logistic Regression

In [ ]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
In [ ]:
#most recent date data
y=odfdt.outcomes
X=odfdt[odfdt.columns.difference(['total_deaths_per_million','total_cases_per_million','date',
                              'tests_units','continent','reproduction_rate','crude_mortality','location','outcomes'])] 
In [ ]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
In [ ]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp = imp.fit(train_X)
train_X = imp.transform(train_X)

valimp = SimpleImputer(missing_values=np.nan, strategy='median')
valimp = valimp.fit(val_X)
val_X = valimp.transform(val_X)
In [ ]:
lrModel = LogisticRegression()
lrModel.fit(train_X, train_y)
In [ ]:
y_pred = lrModel.predict(val_X)
print('Accuracy: {:.2f}'.format(lrModel.score(val_X, val_y)))
In [ ]:
print(classification_report(val_y, y_pred))

kNN

  • using only most recent time data
In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
In [ ]:
#Most recent data
odfdt = odft.copy()
odfdt['date'] = pd.to_datetime(odfdt['date'])
cutoff_date = pd.Timestamp('2021-02-01 00:00:00');

#Shortened dataset to only the most recent date to remove all time-related values
odfdt = odfdt[odfdt.date == cutoff_date]
In [ ]:
y=odfdt.outcomes
X=odfdt[odfdt.columns.difference(['total_deaths_per_million','total_cases_per_million','date',
                              'tests_units','continent','reproduction_rate','crude_mortality','location','outcomes'])] 
In [ ]:
sc = StandardScaler()
X_train = sc.fit_transform(train_X)
X_test = sc.transform(val_X)
In [ ]:
kNNmodel = KNeighborsClassifier(n_neighbors = 5)
kNNmodel.fit(X_train, train_y)
In [ ]:
y_pred = kNNmodel.predict(X_test)
In [ ]:
from sklearn.metrics import classification_report
In [ ]:
cmatrix = confusion_matrix(val_y, y_pred)
cmatrix
In [ ]:
print(classification_report(val_y, y_pred))

kNN

  • using processed dataset
In [ ]:
y=odft.outcomes
X=odft[odft.columns.difference(['total_deaths_per_million','total_cases_per_million','date',
                              'tests_units','continent','reproduction_rate','crude_mortality','location','outcomes'])] 
In [ ]:
sc = StandardScaler()
X_train = sc.fit_transform(train_X)
X_test = sc.transform(val_X)
In [ ]:
kNNmodel = KNeighborsClassifier(n_neighbors = 50)
kNNmodel.fit(X_train, train_y)
In [ ]:
y_pred = kNNmodel.predict(X_test)
In [ ]:
from sklearn.metrics import classification_report
In [ ]:
cmatrix = confusion_matrix(val_y, y_pred)
cmatrix
In [ ]:
print(classification_report(val_y, y_pred))
#overfitting again